#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017-11-10 11:17
# @Author  : zejin
# @File    : wiki.py

from bs4 import BeautifulSoup as sp
from urllib import parse
from urllib import request
import re
import pymysql.cursors

# user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36

# https://en.wikipedia.org/wiki/Main_Page

# authority:en.wikipedia.org



# Fetch the Wikipedia Main Page, spoofing a browser User-Agent so the
# request is not rejected as a bot.
req = request.Request("https://en.wikipedia.org/wiki/Main_Page")
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
req.add_header("Origin", "http://en.wikipedia.org")

resp = request.urlopen(req)
context = resp.read().decode("utf-8")
soup = sp(context, "html.parser")

# Collect every internal wiki link (href beginning with /wiki/).
listurl = soup.find_all("a", href=re.compile(r"^/wiki/"))

# Pattern for image links we want to skip (raw string: avoids the invalid
# "\." escape warning of the original non-raw literal).
IMAGE_RE = re.compile(r"\.(jpg|JPG)$")

# Open ONE database connection for the whole run. The original opened and
# closed a fresh connection on every loop iteration, which is wasteful and
# would leak a connection if an iteration failed before its try block.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='root',
                             db='wikiurl',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # Parameterized statement — values are bound by the driver, never
        # interpolated into the SQL string.
        sql = "insert into `urls`(`urlname`,`urlhref`) values (%s,%s)"
        # `link` instead of `list`, which shadowed the builtin.
        for link in listurl:
            # The original applied this filter only to the print but still
            # inserted image links into the table; gate the insert as well.
            if IMAGE_RE.search(link["href"]):
                continue
            url = "https://en.wikipedia.org" + link["href"]
            # string gets the direct text; get_text() gets all text under the tag
            print(link.get_text() + "<-->" + url)
            cursor.execute(sql, (link.get_text(), url))
    # One commit for the batch instead of a commit per row.
    connection.commit()
finally:
    connection.close()





